bring data
library(readr)
# Read the tab-delimited search log into `data`.
data <- read_delim(
  file = "C:/Users/mkim458/Desktop/data.txt",
  delim = "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)
## Parsed with column specification:
## cols(
## .default = col_double(),
## date_time = col_datetime(format = ""),
## site_name = col_character(),
## user_location_country = col_character(),
## user_location_region = col_character(),
## user_location_city = col_character(),
## user_location_latitude = col_character(),
## user_location_longitude = col_character(),
## orig_destination_distance = col_character(),
## srch_ci = col_date(format = ""),
## srch_co = col_date(format = ""),
## hotel_country = col_character(),
## distance_band = col_character(),
## hist_price_band = col_character(),
## popularity_band = col_character()
## )
## See spec(...) for full column specifications.
# Read the tab-delimited destination table into `dest`.
dest <- read_delim(
  file = "C:/Users/mkim458/Desktop/dest.txt",
  delim = "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)
## Parsed with column specification:
## cols(
## .default = col_double(),
## srch_destination_name = col_character()
## )
## See spec(...) for full column specifications.
Manipulate the data and draw a random sample of US users only
detach("package:readr", unload=TRUE)
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.1 v purrr 0.3.2
## v tibble 2.1.1 v dplyr 0.8.0.1
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ----------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Keep the user-location columns of the search log and the coordinate/name
# columns of the destination table, then combine them on the destination id.
datanew <- select(
  data,
  srch_destination_id,
  user_location_latitude, user_location_longitude,
  user_location_country, user_location_region, user_location_city
)
destnew <- select(
  dest,
  srch_destination_latitude, srch_destination_longitude,
  srch_destination_id, srch_destination_name
)
# full_join keeps rows from both tables, including ids present in only one.
new <- full_join(datanew, destnew, by = "srch_destination_id")
#changing the state abbreviations to full state names (the column is renamed to "region" later)
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
# Expand the two-letter state codes in `user_location_region` to lower-case
# full names so they can later be matched against the `region` column of
# map_data("state").
# The lookup is built from base R's `state.abb` / `state.name` (the 50 states)
# plus DC. The previous hand-typed table spelled DC as "district of coulumbia",
# which can never match the map's "district of columbia".
# `plyr::revalue()` is called by namespace so this step does not depend on plyr
# being attached (attaching plyr after dplyr masks several dplyr verbs).
new$user_location_region <- plyr::revalue(
  new$user_location_region,
  c(
    setNames(tolower(state.name), state.abb),
    DC = "district of columbia"
  )
)

# Keep US users only and draw a random sample of 1000 rows.
ussample <- new %>%
  filter(user_location_country == "UNITED STATES OF AMERICA") %>%
  sample_n(1000)
make csv
# Persist the sample and read it back from the SAME file. Previously the file
# was written relative to the working directory but read from the Desktop, so
# the round trip only worked when the working directory was the Desktop.
us_sample_path <- "c:/Users/mkim458/Desktop/US_Sample.csv"
# dataset to csv (row names are written, as before, so the column layout of the
# re-read data frame is unchanged)
write.csv(ussample, file = us_sample_path)
# attach csv
ussample <- read.csv(us_sample_path)
Count users by state, rename the state column to "region", and convert the coordinates to numeric
detach("package:plyr", unload=TRUE)
## Warning: 'plyr' namespace cannot be unloaded:
## namespace 'plyr' is imported by 'ggplot2' so cannot be unloaded
# counting users per region
# `Count` is the number of sampled users per state. It is computed before the
# "NULL" filter, so it also counts users whose coordinates are missing.
# The grouping is dropped afterwards: the grouping column is renamed below, and
# a grouped tibble would otherwise keep referring to the old column name.
usercount <- ussample %>%
  select(
    user_location_latitude, user_location_longitude,
    user_location_region, user_location_city
  ) %>%
  group_by(user_location_region) %>%
  mutate(Count = n()) %>%
  ungroup() %>%
  filter(user_location_latitude != "NULL")
# change the column name
colnames(usercount)[colnames(usercount) == "user_location_region"] <- "region"
# change to numeric
# Go through as.character() first: when read.csv() has turned these columns
# into factors, as.numeric() alone returns the internal level codes rather than
# the coordinates. For character input the extra step is a no-op.
usercount$user_location_latitude <-
  as.numeric(as.character(usercount$user_location_latitude))
usercount$user_location_longitude <-
  as.numeric(as.character(usercount$user_location_longitude))
Draw US User Map
library(ggmap)
## Warning: package 'ggmap' was built under R version 3.5.3
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(maps)
## Warning: package 'maps' was built under R version 3.5.3
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
# Bounding box of the contiguous US and a Stamen "toner-lite" base map for it.
# NOTE(review): `map` is not referenced again in the visible part of this
# script — confirm it is still needed.
us <- c(
  left = -125,
  bottom = 25.75,
  right = -67,
  top = 49
)
map <- get_stamenmap(bbox = us, zoom = 5, maptype = "toner-lite")
## Source : http://tile.stamen.com/toner-lite/5/4/10.png
## Source : http://tile.stamen.com/toner-lite/5/5/10.png
## Source : http://tile.stamen.com/toner-lite/5/6/10.png
## Source : http://tile.stamen.com/toner-lite/5/7/10.png
## Source : http://tile.stamen.com/toner-lite/5/8/10.png
## Source : http://tile.stamen.com/toner-lite/5/9/10.png
## Source : http://tile.stamen.com/toner-lite/5/10/10.png
## Source : http://tile.stamen.com/toner-lite/5/4/11.png
## Source : http://tile.stamen.com/toner-lite/5/5/11.png
## Source : http://tile.stamen.com/toner-lite/5/6/11.png
## Source : http://tile.stamen.com/toner-lite/5/7/11.png
## Source : http://tile.stamen.com/toner-lite/5/8/11.png
## Source : http://tile.stamen.com/toner-lite/5/9/11.png
## Source : http://tile.stamen.com/toner-lite/5/10/11.png
## Source : http://tile.stamen.com/toner-lite/5/4/12.png
## Source : http://tile.stamen.com/toner-lite/5/5/12.png
## Source : http://tile.stamen.com/toner-lite/5/6/12.png
## Source : http://tile.stamen.com/toner-lite/5/7/12.png
## Source : http://tile.stamen.com/toner-lite/5/8/12.png
## Source : http://tile.stamen.com/toner-lite/5/9/12.png
## Source : http://tile.stamen.com/toner-lite/5/10/12.png
## Source : http://tile.stamen.com/toner-lite/5/4/13.png
## Source : http://tile.stamen.com/toner-lite/5/5/13.png
## Source : http://tile.stamen.com/toner-lite/5/6/13.png
## Source : http://tile.stamen.com/toner-lite/5/7/13.png
## Source : http://tile.stamen.com/toner-lite/5/8/13.png
## Source : http://tile.stamen.com/toner-lite/5/9/13.png
## Source : http://tile.stamen.com/toner-lite/5/10/13.png
# joining state with usercount
# `usercount` has one row per sampled user, so joining it directly onto the
# state outlines repeats every polygon vertex once per user in that state.
# Collapse it to one row per state first (`Count` is constant within a state),
# and make the key a character vector so it has the same type as
# `states$region`.
states <- map_data("state")
state_counts <- usercount %>%
  ungroup() %>%
  distinct(region, Count) %>%
  mutate(region = as.character(region))
# left_join keeps every outline vertex; regions absent from the outlines would
# only add rows without coordinates, which cannot be drawn.
usercountmap <- left_join(states, state_counts, by = "region")
## Warning: Column `region` joining character vector and factor, coercing into
## character vector
# Choropleth of the state outlines, filled by the per-state user count.
qplot(
  long, lat,
  data = usercountmap,
  geom = "polygon",
  group = group,
  fill = Count,
  main = "Expedia Usage by State",
  xlab = "Longitude",
  ylab = "Latitude"
) +
  scale_fill_gradient(low = "orange", high = "purple")

separate country from srch_destination_name
# Keep only the last ", "-separated piece of each destination name (the piece
# after the final ", ", i.e. the country in "City, Region, Country").
# This is done with vapply() instead of an index loop, and without reusing the
# name `dest`, which already holds the destination table read at the top of the
# script.
ussample$srch_destination_name <- as.character(ussample$srch_destination_name)
ussample$srch_destination_name <- vapply(
  strsplit(ussample$srch_destination_name, ", "),
  # rev(...)[1] yields NA (not an error) for empty or missing names
  function(parts) rev(parts)[1],
  character(1),
  USE.NAMES = FALSE
)
# change column name
colnames(ussample)[colnames(ussample) == "srch_destination_name"] <- "region"
counting the search destinations
# Draw 1000 rows from the US sample.
samp <- sample_n(ussample, 1000)
# Attach to every row the number of sampled rows sharing its `region`, then
# keep the count, the destination coordinates and the region.
srchcount <- select(
  mutate(group_by(samp, region), cnt = n()),
  cnt, srch_destination_latitude, srch_destination_longitude, region
)
detach("package:ggmap", unload=TRUE)
detach("package:maps", unload=TRUE)
library(plyr)
## Warning: package 'plyr' was built under R version 3.5.3
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
# Recode two country names to the short forms "USA" and "UK".
srchcount$region <- revalue(
  x = srchcount$region,
  replace = c(
    "United States of America" = "USA",
    "United Kingdom" = "UK"
  )
)
Graph Search Destination in World Map
detach("package:plyr", unload=TRUE)
## Warning: 'plyr' namespace cannot be unloaded:
## namespace 'plyr' is imported by 'ggplot2' so cannot be unloaded
library(ggmap)
## Warning: package 'ggmap' was built under R version 3.5.3
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(maps)
## Warning: package 'maps' was built under R version 3.5.3
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
# load world map
world <- map_data("world")
# `srchcount` has one row per sampled search, so joining it directly onto the
# outlines repeats every polygon vertex once per search of that country.
# Collapse it to one row per country first (`cnt` is constant within a
# country), and drop the USA before the join rather than after it.
dest_counts <- srchcount %>%
  ungroup() %>%
  distinct(region, cnt) %>%
  filter(region != "USA")
# join the world map with the count
# left_join keeps every outline vertex; countries missing from the outlines
# would only add rows without coordinates, which cannot be drawn.
final <- world %>%
  filter(region != "USA") %>%
  left_join(dest_counts, by = "region")
# map the world with data
qplot(
  long, lat,
  data = final,
  geom = "polygon",
  group = group,
  fill = cnt,
  main = "Most searched Destinations",
  xlab = "Longitude",
  ylab = "Latitude"
) +
  scale_fill_gradient(low = "yellow", high = "blue")

Graph the search only in Europe
library(ggmap)
library(maps)
OR
# Choropleth of `cnt`, restricted to vertices with longitude in [-10, 25] and
# latitude in [35, 65].
# NOTE(review): `eumapf` is not defined in the visible part of this script —
# confirm where it is created.
ggplot(
  data = filter(eumapf, between(long, -10, 25), between(lat, 35, 65)),
  mapping = aes(x = long, y = lat, group = group)
) +
  geom_polygon(mapping = aes(fill = cnt)) +
  ggtitle("Most Searched European Countries") +
  theme(plot.title = element_text(hjust = 0.5))

Bar chart of the most searched countries, ordered by number of searches
# Rank destination countries (excluding the US) and plot the top 30.
# NOTE(review): the counts are rows of `destnew`; confirm that one row there
# corresponds to one search, as the axis label says.
sampledest <- destnew %>%
  select(srch_destination_name)
sampledest$n <- 1

sampledest %>%
  # The country is whatever follows the last comma. Splitting on "," into a
  # fixed four columns left a leading space on the country (so the US filter
  # below never matched) and dropped the country of names with more than four
  # pieces.
  mutate(country = trimws(sub("^.*,", "", srch_destination_name))) %>%
  filter(country != "United States of America") %>%
  group_by(country) %>%
  summarise(sum = sum(n)) %>%
  top_n(30, sum) %>%
  ggplot(aes(x = fct_reorder(country, sum), y = sum)) +
  geom_col() +
  ggtitle("Most Searched Countries") +
  theme(plot.title = element_text(hjust = 0.5)) +
  coord_flip() +
  labs(y = "Number of Searches", x = "Country")
## Warning: Expected 4 pieces. Additional pieces discarded in 9 rows [19605,
## 22014, 23227, 24185, 29420, 29616, 33101, 33906, 33981].

Looking at distance by family (adults and children) and only adults
# Distance and party-size columns, keeping only rows whose distance is present
# and is neither the literal string "NA" nor the literal string "NULL".
distance <- filter(
  select(data, orig_destination_distance, srch_adults_cnt, srch_children_cnt),
  orig_destination_distance != "NA",
  orig_destination_distance != "NULL"
)
# Remove an `X` column if one is present.
distance[["X"]] <- NULL
Create a new binary column: 1 (yes) if the search includes children as well as adults; 0 (no) if there are no children
# dummy = 0 when the search has no children, 1 otherwise.
distance2 <- distance %>%
  mutate(dummy = ifelse(srch_children_cnt == 0, 0, 1))
# Random sample of up to 1000 rows without replacement. seq_len() is safe for
# an empty table, and the size is capped at the number of available rows so a
# small table yields a smaller sample instead of an error.
distance3 <- distance2[
  sample(
    seq_len(nrow(distance2)),
    size = min(1000L, nrow(distance2)),
    replace = FALSE
  ),
]
# `dummy` becomes a label; the distance (read as text) becomes a number.
distance3$dummy <- as.character(distance3$dummy)
distance3$orig_destination_distance <-
  as.numeric(distance3$orig_destination_distance)
Box plot of distance by group (dummy = 1: search includes children; dummy = 0: no children)
library(plotly)
## Warning: package 'plotly' was built under R version 3.5.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggmap':
##
## wind
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Box plot of the distance for each value of `dummy`.
plot_ly(
  data = distance3,
  x = ~dummy,
  y = ~orig_destination_distance,
  type = "box"
)
Distance divided into five bands: (1) under 1000, (2) 1000-2500, (3) 2500-5000, (4) 5000-7500, (5) 7500 and above
# Bin the distance into five left-closed bands:
#   [-Inf, 1000), [1000, 2500), [2500, 5000), [5000, 7500), [7500, Inf).
# cut() replaces five masked assignments that silently turned a numeric column
# into text, and it yields an ordered set of levels so the bars appear in band
# order.
distance4 <- distance3 %>%
  mutate(
    category = cut(
      orig_destination_distance,
      breaks = c(-Inf, 1000, 2500, 5000, 7500, Inf),
      labels = c("<1000", "1000-2500", "2500-5000", "5000-7500", "7500+"),
      right = FALSE
    )
  )
# Per-row count of how many sampled rows fall into the same band.
distance5 <- distance4 %>%
  group_by(category) %>%
  mutate(count = n()) %>%
  ungroup()
# Plot one bar per band (not one bar per row).
distance5 %>%
  distinct(category, count) %>%
  plot_ly(x = ~category, y = ~count, type = "bar")
Hotel Star Rating
# Random sample of 10000 rows from the search log.
p1 <- sample_n(data, 10000)
# Party size = adults + children; keep parties of 1 to 4 and drop rows whose
# star rating is 0.
p2 <- p1 %>%
  select(
    prop_starrating, popularity_band,
    srch_adults_cnt, srch_children_cnt, is_booking
  ) %>%
  mutate(totaln = srch_adults_cnt + srch_children_cnt) %>%
  filter(totaln %in% 1:4, prop_starrating != 0)
# Histogram of star rating, one panel per party size, coloured by popularity
# band.
qplot(
  prop_starrating,
  data = p2,
  geom = "histogram",
  facets = ~totaln,
  fill = popularity_band
) +
  labs(x = "Star Rating")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
